In [103]:
import numpy as np
import pandas as pd
import json
pd.set_option('display.max_columns',9999)
Test on 2012 data
In [448]:
# monthList = ['Apr', 'May']
monthList = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
In [449]:
# load the metadata for every month in monthList
# note the encoding 'cp1252' (Western European)
# dfList stores one DataFrame per month
dfList = []
for month in monthList:
    dfList.append(pd.read_csv(r"E:\GoogleDrive\Projects\PERCEIVE\data\Full Disclosure\2012 - Copy\Full_Disclosure_Mailing_List_" + month + "2012.csv",
                              encoding='cp1252',
                              index_col=0))
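As a side note, the per-month loading could be factored into a small helper with an explicit missing-file check; a minimal sketch, assuming the same directory layout and file naming as above (`base_dir` and `load_month_metadata` are names introduced here, not part of the original notebook):

import os

# hypothetical helper, assuming the directory layout used above
base_dir = r"E:\GoogleDrive\Projects\PERCEIVE\data\Full Disclosure\2012 - Copy"

def load_month_metadata(month, year=2012):
    path = os.path.join(base_dir, "Full_Disclosure_Mailing_List_%s%d.csv" % (month, year))
    if not os.path.exists(path):
        raise FileNotFoundError("missing metadata file: " + path)
    return pd.read_csv(path, encoding='cp1252', index_col=0)

# dfList = [load_month_metadata(m) for m in monthList]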
In [450]:
# initialize the four components of Tweets.js
tweet_id = None
author = []
tweet_date = []
text = []

# tweet_id: sequential IDs across all months
total_len = 0
for month_ix in range(len(monthList)):
    total_len += len(dfList[month_ix])
tweet_id = list(range(1, total_len + 1))

# author: strip double quotes from author names
for month_ix in range(len(monthList)):
    author += dfList[month_ix].author.apply(lambda x: x.replace('"', '')).tolist()

# tweet_date: format as 'M/D/YYYY H:M'
for month_ix in range(len(monthList)):
    tweet_date += pd.to_datetime(dfList[month_ix].dateStamp).apply(
        lambda x: str(x.month) + '/' + str(x.day) + '/' + str(x.year) + ' ' + str(x.hour) + ':' + str(x.minute)).tolist()

# text: read each message body and strip characters that break the JS output
for month_ix in range(len(monthList)):
    M = dfList[month_ix]
    for text_ix in range(len(M)):
        ix = str(M['k'].values[text_ix])  # 'k' holds the name of the file
        with open(r'E:\GoogleDrive\Projects\PERCEIVE\data\Full Disclosure\2012 - Copy\2012_' + monthList[month_ix] + '_' + ix + '.txt', 'r', encoding='cp1252') as textfile:
            tmp = textfile.read().replace('"', '').replace('http://', '').replace('\\', '').replace('\n', '')
        text.append(tmp)

# text - TEST VERSION (placeholder text, for quick runs)
# for month_ix in range(len(monthList)):
#     M = dfList[month_ix]
#     for text_ix in range(len(M)):
#         text.append('tmp')
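The date-formatting lambda above reappears when building the bins below, so factoring it into a named helper would keep the copies in sync. A minimal sketch (`fmt_timestamp` is a name introduced here, not part of the original notebook):

def fmt_timestamp(ts):
    # format a pandas Timestamp as 'M/D/YYYY H:M', matching the lambda above
    # (no zero-padding, same as the str() concatenation)
    return '%d/%d/%d %d:%d' % (ts.month, ts.day, ts.year, ts.hour, ts.minute)

# usage: pd.to_datetime(dfList[0].dateStamp).apply(fmt_timestamp).tolist()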
The raw message text contains many characters and symbols that could break the generated JavaScript
In [411]:
text[0]
Out[411]:
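Rather than deleting every troublesome character, one option is to keep the raw text and let the JSON encoder escape it; a hedged sketch of the alternative (it changes the stored text, and it assumes TopicFlow tolerates escaped strings, so it is not what the notebook does):

# alternative: rely on JSON escaping instead of stripping '"', '\\' and newlines
raw = text[0]
escaped = json.dumps(raw)  # json.dumps escapes quotes, backslashes, and newlines
print(escaped[:80])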
In [452]:
# transform into the JSON format TopicFlow expects
# first, build a pd.DataFrame
df_tmp = pd.DataFrame({'tweet_id': tweet_id, 'author': author, 'tweet_date': tweet_date, 'text': text},
                      columns=['tweet_id', 'author', 'tweet_date', 'text'],
                      index=tweet_id)
# then, serialize to JSON
json_tmp = df_tmp.to_json(orient='index')
# finally, wrap the JSON in the .js format that TopicFlow can read
prefix = 'function populate_tweets_test(){\nvar tweet_data ='
postfix = ';\nreadTweetJSON(tweet_data);\n}'
tweetsJS = prefix + json_tmp + postfix
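The same wrap-in-a-JS-function pattern repeats for the bins and similarity files below, so it could live in one helper; a minimal sketch (`wrap_js` is a name introduced here for illustration):

def wrap_js(populate_fn, var_name, reader_fn, json_str):
    # wrap a JSON string in the .js format that TopicFlow can read
    return ('function %s(){\nvar %s = %s;\n%s(%s);\n}'
            % (populate_fn, var_name, json_str, reader_fn, var_name))

# usage:
# tweetsJS = wrap_js('populate_tweets_test', 'tweet_data', 'readTweetJSON', json_tmp)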
In [453]:
def get_tweets(month_abbr):
    # placeholder: per-month tweet extraction, not yet implemented
    pass
In [454]:
# write the result to disk
with open('Tweet.js', 'w') as file:
    file.write(tweetsJS)
In [473]:
# initialize the bins: each month is one bin, and each bin is itself a dictionary
binDict = {}
for month_ix in range(len(monthList)):
    binDict[str(month_ix)] = {}
! May want to check whether the key order in the dict matters
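For what it's worth, on Python 3.7+ (and CPython 3.6) plain dicts preserve insertion order, and `json.dumps` emits keys in that same order, so the month bins should serialize as 0, 1, 2, ...; a quick check:

# dict keys and the JSON output both follow insertion order
d = {str(i): {} for i in range(12)}
print(list(d.keys())[:3])   # keys come back in insertion order
print(json.dumps(d)[:24])   # the JSON string keeps that order too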
In [474]:
# bin_id
for month_ix in range(len(monthList)):
    binDict[str(month_ix)]['bin_id'] = month_ix

# tweet_Ids
# needs input from dfList, specifically the length of each month
for month_ix in range(len(monthList)):
    binDict[str(month_ix)]['tweet_Ids'] = []
lo, hi = 1, 1
for month_ix in range(len(monthList)):
    hi += len(dfList[month_ix])
    for tweet_ix in range(lo, hi):
        binDict[str(month_ix)]['tweet_Ids'].append(tweet_ix)
    lo = hi

# start_time: earliest message in the month
for month_ix in range(len(monthList)):
    binDict[str(month_ix)]['start_time'] = pd.to_datetime(dfList[month_ix].dateStamp).sort_values().apply(
        lambda x: str(x.month) + '/' + str(x.day) + '/' + str(x.year) + ' ' + str(x.hour) + ':' + str(x.minute)).tolist()[0]

# end_time: latest message in the month
for month_ix in range(len(monthList)):
    binDict[str(month_ix)]['end_time'] = pd.to_datetime(dfList[month_ix].dateStamp).sort_values().apply(
        lambda x: str(x.month) + '/' + str(x.day) + '/' + str(x.year) + ' ' + str(x.hour) + ':' + str(x.minute)).tolist()[-1]

# initialize topic_model with its four sub-dictionaries
for month_ix in range(len(monthList)):
    binDict[str(month_ix)]['topic_model'] = {}
    binDict[str(month_ix)]['topic_model']['topic_doc'] = {}
    binDict[str(month_ix)]['topic_model']['doc_topic'] = {}
    binDict[str(month_ix)]['topic_model']['topic_word'] = {}
    binDict[str(month_ix)]['topic_model']['topic_prob'] = {}
! Notice that len(Topic-Doc matrix) != the number of tweet IDs above: not every document in the metadata appears in the topic model
In [475]:
# read the document-topic matrices
dfTopicDoc = []
for month in monthList:
    dfTopicDoc.append(pd.read_csv(r"E:\GoogleDrive\Projects\PERCEIVE\data\LDA_VEM\2012_k_10_12\Document_topic_Matrix\\" + month + ".csv",
                                  index_col=0))

# read the topic-term matrices
dfTopicWord = []
for month in monthList:
    dfTopicWord.append(pd.read_csv(r"E:\GoogleDrive\Projects\PERCEIVE\data\LDA_VEM\2012_k_10_12\Topic_Term_Matrix\\" + month + ".csv",
                                   index_col=0))
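Following the note above, a quick diagnostic of how large the document gap is per month may help; a sketch, assuming the index format 'Mon/2012_Mon_<id>.txt' that is parsed in the next cell:

# diagnostic: how many topic-model documents overlap with the metadata?
for month_ix in range(len(monthList)):
    model_docs = {int(i[13:-4]) for i in dfTopicDoc[month_ix].index.values}
    meta_docs = set(dfList[month_ix]['k'].values)
    print(monthList[month_ix], len(model_docs), len(meta_docs),
          len(model_docs & meta_docs))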
In [477]:
# Build, per month, a DataFrame mapping topics to documents.
# The documents in dfTopicDoc are not the same set as in the metadata.

# pre-step 1: list the starting tweet_id offset of each month
month_start_tweetIds = []
total_len = 0
for month_ix in range(len(monthList)):
    month_start_tweetIds.append(total_len)
    total_len += len(dfList[month_ix])

for month_ix in range(len(monthList)):
    # pre-step 2: find the documents present in both data sets
    Doc_dfTopicDoc = []
    for i in dfTopicDoc[month_ix].index.values:
        Doc_dfTopicDoc.append(int(i[13:-4]))  # strip the 'Mon/2012_Mon_' prefix and '.txt' suffix
    Overlap = set(Doc_dfTopicDoc) & set(dfList[month_ix]['k'].values)

    # pre-step 3: build a DataFrame mapping the overlapping documents to the 10 topics
    Overlap_ix = []
    ix_list = dfTopicDoc[month_ix].index.tolist()
    for item in Overlap:
        name = str(monthList[month_ix]) + '/2012_' + str(monthList[month_ix]) + '_' + str(item) + '.txt'
        Overlap_ix.append(ix_list.index(name))
    dfTopicDoc_Overlap = dfTopicDoc[month_ix].iloc[Overlap_ix, :]

    # pre-step 4: add tweet_ids to dfTopicDoc_Overlap
    Overlap_tweetIds = []
    for k in dfTopicDoc_Overlap.index.values:
        name = int(k[13:-4])
        name_ix = dfList[month_ix]['k'].tolist().index(name) + 1
        name_ix += month_start_tweetIds[month_ix]
        Overlap_tweetIds.append(name_ix)
    dfTopicDoc_Overlap['tweet_id'] = Overlap_tweetIds

    # topic_prob
    # (is there a particular order to these?)
    L = len(dfTopicDoc[month_ix].columns)
    for ix in range(L):
        T = str(month_ix) + '_' + str(ix + 1)
        binDict[str(month_ix)]['topic_model']['topic_prob'][str(ix)] = T

    # topic_doc
    # create the 10 topic keys
    for ix in range(L):
        T = str(month_ix) + '_' + str(ix + 1)
        binDict[str(month_ix)]['topic_model']['topic_doc'][T] = {}
    # add doc values to these keys
    for ix_2 in range(L):
        T = str(month_ix) + '_' + str(ix_2 + 1)
        col_score = dfTopicDoc_Overlap[str(ix_2 + 1)].values
        col_k = dfTopicDoc_Overlap['tweet_id'].values
        for ix_3 in range(len(col_score)):
            binDict[str(month_ix)]['topic_model']['topic_doc'][T][str(col_k[ix_3])] = col_score[ix_3]

    # doc_topic
    for ix_4 in range(len(dfTopicDoc_Overlap)):
        row_score = dfTopicDoc_Overlap.iloc[ix_4, :]
        binDict[str(month_ix)]['topic_model']['doc_topic'][str(int(row_score['tweet_id']))] = {}
        for ix_5 in range(L):
            name = str(month_ix) + '_' + str(ix_5 + 1)
            binDict[str(month_ix)]['topic_model']['doc_topic'][str(int(row_score['tweet_id']))][name] = row_score.iloc[ix_5]

    # topic_word: keep the top 10 words per topic
    for ix_6 in range(L):
        name = str(month_ix) + '_' + str(ix_6 + 1)
        binDict[str(month_ix)]['topic_model']['topic_word'][name] = {}
        topwords = dfTopicWord[month_ix].iloc[ix_6].sort_values(ascending=False)[:10]
        for ix_7 in range(10):
            binDict[str(month_ix)]['topic_model']['topic_word'][name][topwords.index[ix_7]] = topwords.values[ix_7]
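A small consistency check may be worth running here: every document key in `topic_doc` should be one of the same bin's `tweet_Ids`. A sketch, not part of the original notebook:

# sanity check: doc keys in topic_doc must be valid tweet IDs of the same bin
for month_ix in range(len(monthList)):
    bin_ = binDict[str(month_ix)]
    valid = set(bin_['tweet_Ids'])
    for topic, docs in bin_['topic_model']['topic_doc'].items():
        assert all(int(doc_id) in valid for doc_id in docs), (month_ix, topic)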
In [478]:
# first, serialize binDict to JSON
json_tmp = json.dumps(binDict)
# then, wrap the JSON in the .js format that TopicFlow can read
prefix = 'function populate_bins_test(){\nvar bin_data = '
postfix = ';\nreadBinJSON(bin_data);\n}'
BinsJS = prefix + json_tmp + postfix
In [479]:
# write the result to disk
with open('Bins.js', 'w') as file:
    file.write(BinsJS)
In [462]:
# read the topic-flow similarity data set
dfTopicSim = pd.read_csv(r"E:\GoogleDrive\Projects\PERCEIVE\data\LDA_VEM\2012_k_10_12\Topic_Flow\topic_flow.csv")

# simDict
simDict = {}

# put the topics into nodes, recording their order
nodes = []
for i in range(len(monthList)):
    for j in range(1, 11):
        tmp = {}
        name = str(i) + '_' + str(j)
        value = np.random.randint(1, 20)  # I haven't figured out what this value means, so for now keep it random
        tmp['name'], tmp['value'] = name, value
        nodes.append(tmp)

# put source, target, value into links
links = []
for month_ix in range(len(monthList) - 1):
    # get the unique topic pairs between every two consecutive months
    mm1, mm2 = monthList[month_ix], monthList[month_ix + 1]
    sim = mm1 + '_' + mm2 + '_similarity'
    df_tmp = dfTopicSim[[mm1, mm2, sim]].dropna(axis=0).drop_duplicates()
    for row_ix in range(len(df_tmp)):
        source = month_ix * 10 + int(df_tmp[mm1].values[row_ix]) - 1
        target = (month_ix + 1) * 10 + int(df_tmp[mm2].values[row_ix]) - 1
        score = df_tmp[sim].values[row_ix] * 500  # I don't know how TopicFlow derives this scale factor
        link_tmp = {}
        link_tmp['source'], link_tmp['target'], link_tmp['value'] = source, target, score
        links.append(link_tmp)

# put the two lists into simDict
simDict['nodes'], simDict['links'] = nodes, links
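Since the node `value` is still random, one plausible replacement (an assumption about its semantics, not TopicFlow's documented meaning) is the number of documents whose strongest topic is that node:

# hypothetical alternative to the random node value: count the documents
# whose highest-probability topic is this node (assumed semantics)
def topic_doc_count(month_ix, topic_name):
    doc_topic = binDict[str(month_ix)]['topic_model']['doc_topic']
    return sum(1 for scores in doc_topic.values()
               if max(scores, key=scores.get) == topic_name)

# e.g. value = topic_doc_count(i, name) instead of np.random.randint(1, 20)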
In [463]:
# first, serialize simDict to JSON
json_tmp = json.dumps(simDict)
# then, wrap the JSON in the .js format that TopicFlow can read
prefix = 'function populate_similarity_test(){\nvar sim_data = '
postfix = ';\nreadSimilarityJSON(sim_data);\n}'
TopicSimilarityJS = prefix + json_tmp + postfix
In [465]:
# write the result to disk
with open('TopicSimilarity.js', 'w') as file:
    file.write(TopicSimilarityJS)